

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta name="Description" content="scikit-learn: machine learning in Python">

  <title>6. Dataset transformations &mdash; scikit-learn 0.22 documentation</title>

  <!-- Canonical URL uses https, matching the other absolute scikit-learn.org
       links on this page, so it does not point at a redirecting http URL. -->
  <link rel="canonical" href="https://scikit-learn.org/stable/data_transforms.html" />

  <link rel="shortcut icon" href="_static/favicon.ico"/>

  <link rel="stylesheet" href="_static/css/vendor/bootstrap.min.css" type="text/css" />
  <link rel="stylesheet" href="_static/gallery.css" type="text/css" />
  <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  <!-- jQuery is loaded synchronously here because the inline script at the end
       of <body> calls $(document).ready(...). -->
<script id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
</head>
<body>
<nav id="navbar" class="sk-docs-navbar navbar navbar-expand-md navbar-light bg-light py-0">
  <div class="container-fluid sk-docs-container px-0">
      <a class="navbar-brand py-0" href="index.html">
        <img
          class="sk-brand-img"
          src="_static/scikit-learn-logo-small.png"
          alt="logo"/>
      </a>
    <!-- Toggler for the collapsible menu #navbarSupportedContent (see data-target /
         aria-controls). The data-toggle="collapse" behaviour is presumably provided by
         the Bootstrap script loaded at the end of <body>. The inline navbar-hiding
         script reads this button's aria-expanded value before hiding the bar. -->
    <button
      id="sk-navbar-toggler"
      class="navbar-toggler"
      type="button"
      data-toggle="collapse"
      data-target="#navbarSupportedContent"
      aria-controls="navbarSupportedContent"
      aria-expanded="false"
      aria-label="Toggle navigation"
    >
      <span class="navbar-toggler-icon"></span>
    </button>

    <div class="sk-navbar-collapse collapse navbar-collapse" id="navbarSupportedContent">
      <!-- Primary navigation links. Entries carrying the class
           nav-more-item-mobile-items repeat, one for one, the links inside the
           "More" dropdown at the end of this list. Presumably the theme CSS shows
           only one of the two sets depending on viewport width (verify in theme.css). -->
      <ul class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="install.html">Install</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="modules/classes.html">API</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="getting_started.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="tutorial/index.html">Tutorial</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="glossary.html">Glossary</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="developers/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="faq.html">FAQ</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="related_projects.html">Related packages</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="roadmap.html">Roadmap</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="about.html">About us</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
        </li>
        <!-- "More" dropdown: same destinations as the nav-more-item-mobile-items links above. -->
        <li class="nav-item dropdown nav-more-item-dropdown">
          <a class="sk-nav-link nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
          <div class="dropdown-menu" aria-labelledby="navbarDropdown">
              <a class="sk-nav-dropdown-item dropdown-item" href="getting_started.html">Getting Started</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="tutorial/index.html">Tutorial</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="glossary.html">Glossary</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="developers/index.html">Development</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="faq.html">FAQ</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="related_projects.html">Related packages</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="roadmap.html">Roadmap</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="about.html">About us</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
          </div>
        </li>
      </ul>
      <!-- Site search: submits the query as ?q=... to search.html via GET. -->
      <div id="searchbox" role="search">
          <div class="searchformwrapper">
          <form class="search" action="search.html" method="get">
            <!-- The field is named directly with aria-label: no element with
                 id "searchlabel" exists in this document, so an aria-labelledby
                 reference to it would leave the input without an accessible name. -->
            <input class="sk-search-text-input" type="text" name="q" aria-label="Search" />
            <input class="sk-search-text-btn" type="submit" value="Go" />
          </form>
          </div>
      </div>
    </div>
  </div>
</nav>
<div class="d-flex" id="sk-doc-wrapper">
    <input type="checkbox" name="sk-toggle-checkbox" id="sk-toggle-checkbox">
    <label id="sk-sidemenu-toggle" class="sk-btn-toggle-toc btn sk-btn-primary" for="sk-toggle-checkbox">Toggle Menu</label>
    <!-- Sidebar: logo, prev/up/next links, version and citation notices, page TOC. -->
    <div id="sk-sidebar-wrapper" class="border-right">
      <div class="sk-sidebar-toc-wrapper">
        <div class="sk-sidebar-toc-logo">
          <a href="index.html">
            <img
              class="sk-brand-img"
              src="_static/scikit-learn-logo-small.png"
              alt="logo"/>
          </a>
        </div>
        <!-- Related-page links. Prev and Up are deliberately kept on one line with no
             whitespace between the anchors, as in the generated markup. -->
        <div class="btn-group w-100 mb-2" role="group" aria-label="rellinks">
            <a href="visualizations.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="5. Visualizations">Prev</a><a href="user_guide.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="User Guide">Up</a>
            <a href="modules/compose.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="6.1. Pipelines and composite estimators">Next</a>
        </div>
        <div class="alert alert-danger p-1 mb-2" role="alert">
          <p class="text-center mb-0">
          <strong>scikit-learn 0.22</strong><br/>
          <!-- https, consistent with the "Other Versions" links in the navbar -->
          <a href="https://scikit-learn.org/dev/versions.html">Other versions</a>
          </p>
        </div>
        <div class="alert alert-warning p-1 mb-2" role="alert">
          <p class="text-center mb-0">
            Please <a class="font-weight-bold" href="about.html#citing-scikit-learn"><strong>cite us</strong></a> if you use the software.
          </p>
        </div>
          <!-- In-page table of contents; this page has only its top-level heading. -->
          <div class="sk-sidebar-toc">
            <ul>
<li><a class="reference internal" href="#">6. Dataset transformations</a></li>
</ul>

          </div>
      </div>
    </div>
    <div id="sk-page-content-wrapper">
      <div class="sk-page-content container-fluid body px-md-3" role="main">
        
  <style type="text/css">
  /* Page-local overrides for the chapter table of contents (.toctree-wrapper).
     Percentage font sizes are relative to the parent element, so nested levels
     compound with the size set on the level above them. */

  /* Drop the left padding on every list inside the toctree wrapper;
     indentation is re-added per level with margin-left below. */
  div.body div.toctree-wrapper ul {
      padding-left: 0;
  }

  /* Level 1 (sub-chapter titles): large, bold, no bullet, 0.5em gap below. */
  div.body li.toctree-l1 {
      padding: 0 0 0.5em 0;
      list-style-type: none;
      font-size: 150%;
      font-weight: bold;
  }

  /* Level 2 (sections): 70% of the level-1 size, normal weight, square bullet. */
  div.body li.toctree-l2 {
      font-size: 70%;
      list-style-type: square;
      font-weight: normal;
      margin-left: 40px;
  }

  /* Level 3: 85% of the level-2 size, circle bullet.
     No toctree-l3 items appear in this page's markup. */
  div.body li.toctree-l3 {
      font-size: 85%;
      list-style-type: circle;
      font-weight: normal;
      margin-left: 40px;
  }

  /* Level 4: indentation only. No toctree-l4 items appear in this page's markup. */
  div.body li.toctree-l4 {
      margin-left: 40px;
  }

</style><div class="section" id="dataset-transformations">
<span id="data-transforms"></span><h1>6. Dataset transformations<a class="headerlink" href="#dataset-transformations" title="Permalink to this headline">¶</a></h1>
<p>scikit-learn provides a library of transformers, which may clean (see
<a class="reference internal" href="modules/preprocessing.html#preprocessing"><span class="std std-ref">Preprocessing data</span></a>), reduce (see <a class="reference internal" href="modules/unsupervised_reduction.html#data-reduction"><span class="std std-ref">Unsupervised dimensionality reduction</span></a>), expand (see
<a class="reference internal" href="modules/kernel_approximation.html#kernel-approximation"><span class="std std-ref">Kernel Approximation</span></a>) or generate (see <a class="reference internal" href="modules/feature_extraction.html#feature-extraction"><span class="std std-ref">Feature extraction</span></a>)
feature representations.</p>
<p>Like other estimators, these are represented by classes with a <code class="docutils literal notranslate"><span class="pre">fit</span></code> method,
which learns model parameters (e.g. mean and standard deviation for
normalization) from a training set, and a <code class="docutils literal notranslate"><span class="pre">transform</span></code> method which applies
this transformation model to unseen data. <code class="docutils literal notranslate"><span class="pre">fit_transform</span></code> may be more
convenient and efficient for modelling and transforming the training data
simultaneously.</p>
<p>Combining such transformers, either in parallel or in series, is covered in
<a class="reference internal" href="modules/compose.html#combining-estimators"><span class="std std-ref">Pipelines and composite estimators</span></a>. <a class="reference internal" href="modules/metrics.html#metrics"><span class="std std-ref">Pairwise metrics, Affinities and Kernels</span></a> covers transforming feature
spaces into affinity matrices, while <a class="reference internal" href="modules/preprocessing_targets.html#preprocessing-targets"><span class="std std-ref">Transforming the prediction target (y)</span></a> considers
transformations of the target space (e.g. categorical labels) for use in
scikit-learn.</p>
<!-- Chapter table of contents. Each toctree-l1 item links to a sub-chapter page
     under modules/; its nested toctree-l2 items link to section anchors within
     that same page. -->
<div class="toctree-wrapper compound">
<ul>
<li class="toctree-l1"><a class="reference internal" href="modules/compose.html">6.1. Pipelines and composite estimators</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/compose.html#pipeline-chaining-estimators">6.1.1. Pipeline: chaining estimators</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/compose.html#transforming-target-in-regression">6.1.2. Transforming target in regression</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/compose.html#featureunion-composite-feature-spaces">6.1.3. FeatureUnion: composite feature spaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/compose.html#columntransformer-for-heterogeneous-data">6.1.4. ColumnTransformer for heterogeneous data</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/feature_extraction.html">6.2. Feature extraction</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/feature_extraction.html#loading-features-from-dicts">6.2.1. Loading features from dicts</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/feature_extraction.html#feature-hashing">6.2.2. Feature hashing</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/feature_extraction.html#text-feature-extraction">6.2.3. Text feature extraction</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/feature_extraction.html#image-feature-extraction">6.2.4. Image feature extraction</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/preprocessing.html">6.3. Preprocessing data</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling">6.3.1. Standardization, or mean removal and variance scaling</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#non-linear-transformation">6.3.2. Non-linear transformation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#normalization">6.3.3. Normalization</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#encoding-categorical-features">6.3.4. Encoding categorical features</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#discretization">6.3.5. Discretization</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#imputation-of-missing-values">6.3.6. Imputation of missing values</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#generating-polynomial-features">6.3.7. Generating polynomial features</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing.html#custom-transformers">6.3.8. Custom transformers</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/impute.html">6.4. Imputation of missing values</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#univariate-vs-multivariate-imputation">6.4.1. Univariate vs. Multivariate Imputation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#univariate-feature-imputation">6.4.2. Univariate feature imputation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#multivariate-feature-imputation">6.4.3. Multivariate feature imputation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#references">6.4.4. References</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#nearest-neighbors-imputation">6.4.5. Nearest neighbors imputation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/impute.html#marking-imputed-values">6.4.6. Marking imputed values</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/unsupervised_reduction.html">6.5. Unsupervised dimensionality reduction</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/unsupervised_reduction.html#pca-principal-component-analysis">6.5.1. PCA: principal component analysis</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/unsupervised_reduction.html#random-projections">6.5.2. Random projections</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/unsupervised_reduction.html#feature-agglomeration">6.5.3. Feature agglomeration</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/random_projection.html">6.6. Random Projection</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/random_projection.html#the-johnson-lindenstrauss-lemma">6.6.1. The Johnson-Lindenstrauss lemma</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/random_projection.html#gaussian-random-projection">6.6.2. Gaussian random projection</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/random_projection.html#sparse-random-projection">6.6.3. Sparse random projection</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/kernel_approximation.html">6.7. Kernel Approximation</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/kernel_approximation.html#nystroem-method-for-kernel-approximation">6.7.1. Nystroem Method for Kernel Approximation</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/kernel_approximation.html#radial-basis-function-kernel">6.7.2. Radial Basis Function Kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/kernel_approximation.html#additive-chi-squared-kernel">6.7.3. Additive Chi Squared Kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/kernel_approximation.html#skewed-chi-squared-kernel">6.7.4. Skewed Chi Squared Kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/kernel_approximation.html#mathematical-details">6.7.5. Mathematical Details</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/metrics.html">6.8. Pairwise metrics, Affinities and Kernels</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#cosine-similarity">6.8.1. Cosine similarity</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#linear-kernel">6.8.2. Linear kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#polynomial-kernel">6.8.3. Polynomial kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#sigmoid-kernel">6.8.4. Sigmoid kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#rbf-kernel">6.8.5. RBF kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#laplacian-kernel">6.8.6. Laplacian kernel</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/metrics.html#chi-squared-kernel">6.8.7. Chi-squared kernel</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules/preprocessing_targets.html">6.9. Transforming the prediction target (<code class="docutils literal notranslate"><span class="pre">y</span></code>)</a><ul>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing_targets.html#label-binarization">6.9.1. Label binarization</a></li>
<li class="toctree-l2"><a class="reference internal" href="modules/preprocessing_targets.html#label-encoding">6.9.2. Label encoding</a></li>
</ul>
</li>
</ul>
</div>
</div>


      </div>
    <!-- Page footer: copyright notice plus a link to this page's source file
         (_sources/data_transforms.rst.txt), marked rel="nofollow". -->
    <div class="container">
      <footer class="sk-content-footer">
            &copy; 2007 - 2019, scikit-learn developers (BSD License).
          <a href="_sources/data_transforms.rst.txt" rel="nofollow">Show this page source</a>
      </footer>
    </div>
  </div>
</div>
<script src="_static/js/vendor/bootstrap.min.js"></script>

<script>
    // Google Analytics bootstrap. If window.ga is not defined yet, install a stub
    // that pushes each call's arguments onto ga.q; ga.l stores the current
    // timestamp. Presumably analytics.js (loaded async below) drains that queue
    // once it arrives -- see the analytics.js documentation.
    window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
    ga('create', 'UA-22606712-2', 'auto');
    // Sets the anonymizeIp field before the pageview hit is sent.
    ga('set', 'anonymizeIp', true);
    ga('send', 'pageview');
</script>
<script async src='https://www.google-analytics.com/analytics.js'></script>


<script>
$(document).ready(function() {
    /* Code samples that contain interpreter prompts get a [>>>] toggle.
     * Activating it hides the >>> and ... prompts together with the printed
     * output, so that what remains can be copied and run directly. */
    var codeBlocks = $([
        '.highlight-python .highlight',
        '.highlight-python3 .highlight',
        '.highlight-pycon .highlight',
        '.highlight-default .highlight'
    ].join(','));

    // Make the grandparent of every <pre> relatively positioned (presumably so
    // the theme CSS can place the toggle inside it -- verify in the stylesheet).
    codeBlocks.find('pre').parent().parent().css('position', 'relative');

    var hideLabel = 'Hide prompts and outputs';
    var showLabel = 'Show prompts and outputs';

    codeBlocks.each(function() {
        var block = $(this);

        // Only blocks that contain at least one prompt (.gp) get a toggle.
        if (block.find('.gp').length > 0) {
            $('<span class="copybutton">&gt;&gt;&gt;</span>')
                .attr('title', hideLabel)
                .data('hidden', 'false')
                .prependTo(block);
        }

        // Inside a <pre> that holds a traceback (.gt), wrap every non-blank
        // bare text node in a <span> so the click handler's .nextUntil()
        // call, which only walks elements, can reach it.
        block.find('pre:has(.gt)').contents().filter(function() {
            var isTextNode = (this.nodeType === 3);
            return isTextNode && this.data.trim().length > 0;
        }).wrap('<span>');
    });

    // Toggle behaviour: the 'hidden' data value holds the string 'true' or
    // 'false'; anything other than 'false' is treated as "currently hidden".
    $('.copybutton').click(function(event) {
        event.preventDefault();

        var toggle = $(this);
        var hiding = (toggle.data('hidden') === 'false');
        var promptsAndOutput = toggle.parent().find('.go, .gp, .gt');
        var tracebackBody = toggle.next('pre').find('.gt').nextUntil('.gp, .go');

        if (hiding) {
            promptsAndOutput.hide();
        } else {
            promptsAndOutput.show();
        }
        // Traceback bodies are made invisible rather than removed from layout.
        tracebackBody.css('visibility', hiding ? 'hidden' : 'visible');
        toggle.css('text-decoration', hiding ? 'line-through' : 'none');
        toggle.attr('title', hiding ? showLabel : hideLabel);
        toggle.data('hidden', hiding ? 'true' : 'false');
    });

	/*** Glossary terms: append a permalink anchor to every <dt> that has an id ***/
	$('dl.glossary > dt[id]').append(function() {
		var termId = this.getAttribute('id');
		return '<a class="headerlink" href="#' + termId + '" title="Permalink to this term">¶</a>';
	});
  /*** Hide navbar when scrolling down ***/
  (function() {
    // All helpers are declared with `var` so they stay private to this
    // closure instead of becoming implicit globals on `window`.

    // Returns true when the element whose id matches the URL hash sits within
    // 2px of the top of the viewport; false when there is no usable hash or
    // no element with that id.
    var hashTargetOnTop = function() {
        var hash = window.location.hash;
        if ( hash.length < 2 ) { return false; }

        var target = document.getElementById( hash.slice(1) );
        if ( target === null ) { return false; }

        var top = target.getBoundingClientRect().top;
        return (top < 2) && (top > -2);
    };

    var navBar = document.getElementById("navbar");
    var navBarToggler = document.getElementById("sk-navbar-toggler");
    // Negative offset equal to the navbar's height, measured once when this runs.
    var navBarHeightHidden = "-" + navBar.getBoundingClientRect().height + "px";
    var $window = $(window);

    var hideNavBar = function() {
        navBar.style.top = navBarHeightHidden;
    };

    var showNavBar = function() {
        navBar.style.top = "0";
    };

    // Hide navbar on load if hash target is on top
    if (hashTargetOnTop()) {
        hideNavBar();
    }

    var prevScrollpos = window.pageYOffset;

    // Given the newest scroll offset, hide the bar when the page moved down
    // (and is more than 2px from the top) or a hash target is at the top of the
    // viewport; otherwise show it. Remembers the offset for the next call.
    var hideOnScroll = function(lastScrollTop) {
        // Do nothing while the window is narrower than 768px and the toggler
        // reports the collapsible menu as expanded.
        if (($window.width() < 768) && (navBarToggler.getAttribute("aria-expanded") === 'true')) {
            return;
        }
        var movedDown = (lastScrollTop > 2) && (prevScrollpos <= lastScrollTop);
        if (movedDown || hashTargetOnTop()) {
            hideNavBar();
        } else {
            showNavBar();
        }
        prevScrollpos = lastScrollTop;
    };

    /*** high performance scroll event listener ***/
    var raf = window.requestAnimationFrame ||
        window.webkitRequestAnimationFrame ||
        window.mozRequestAnimationFrame ||
        window.msRequestAnimationFrame ||
        window.oRequestAnimationFrame;
    var lastScrollTop = $window.scrollTop();

    // Re-schedules itself every animation frame and compares the scroll offset
    // with the last one seen; hideOnScroll runs only when the offset changed.
    function loop() {
        var scrollTop = $window.scrollTop();
        if (lastScrollTop !== scrollTop) {
            lastScrollTop = scrollTop;
            hideOnScroll(lastScrollTop);
        }
        raf(loop);
    }

    // Without any requestAnimationFrame implementation the polling never starts.
    if (raf) {
        loop();
    }
  })();
});

</script>
    
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
    
</body>
</html>